*** Build the pooled world child-level data file
* Session settings: never pause for --more--, start from an empty dataset,
* and size the matrix/memory limits before any data are loaded.
set more off
drop _all
set matsize 2000
set memory 5g

* Load the raw pooled file, then open a fresh plain-text log for this run.
use "world_child.dta"
capture log close
set logtype text
log using "world_child3create", replace

* Mother's height is required throughout, so first tabulate the
* missing-height flag separately for every survey (id2).
bys id2: tab ht_miss

** Remove every survey for which NO heights were recorded.
** String inlist() accepts at most nine values per call, hence the chunks.
drop if inlist(id2,"ia1","br1","br2","cm1","co1","co2","dr1","dr2","eg1") ///
	| inlist(id2,"gh1","ke2","ma1","md1","ml1","ng2","nm2","rw1","sn1") ///
	| inlist(id2,"sn4","tg1","ug1","zw1","za1","ye1","vn2","vn1","tt1") ///
	| inlist(id2,"tn1","th1","sd1","py1","pk1","ph3","ph2","ph1","mx1") ///
	| inlist(id2,"lk1","lb1","id5","id4","id3","id2","id1","gu1","ec1") ///
	| inlist(id2,"bu1","bo1","bd3")

* Pooled check of the missing-height flag after the drop.
tab ht_miss

* Partner's education in years is v715 in the world file; the Indian file
* already carries it as educmyrs. Values above 30 are set to missing.
rename v715 educmyrs
replace educmyrs=. if educmyrs>30

* India is removed here and appended back from its own file below.
* The India file codes educm with four dummies (educm1-educm4) instead of
* six, so the world coding is collapsed to four groups before the append.

drop if country=="India"

* Four-group dummies built from the original six-level educm.
drop educm1-educm6
generate educm1=(educm==0)
generate educm2=(0<educm & educm<3)
generate educm3=(2<educm & educm<5)
generate educm4=(educm==5)

* Collapse educm itself to the India coding: 2 -> 1, 3 and 4 -> 2, 5 -> 3.
recode educm (2=1) (3 4=2) (5=3)

label define temp1 0 "No Education" 1 "Primary Education" 2 "Secondary Education" 3 "Higher"
label values educm temp1

append using /shared/research/srbmedia/Sam/Heights/India/child-mortality/india_child_123

* Survey ia1 is excluded from the appended rows as well; appended rows
* that arrive without a country name are labelled India.
drop if id2=="ia1"
replace country="India" if country==""

* Mothers younger than 21 are excluded (rows with missing agem are kept,
* because a missing value is not below 21).
drop if agem < 21

* Rebuild the educf dummies on the same four-group scheme used for educm.
drop educf1-educf6
generate educf1=(educf==0)
generate educf2=(0<educf & educf<3)
generate educf3=(2<educf & educf<5)
generate educf4=(educf==5)

* Collapse educf itself: 2 -> 1, 3 and 4 -> 2, 5 -> 3.
recode educf (2=1) (3 4=2) (5=3)

label define temp2 0 "No Education" 1 "Primary Education" 2 "Secondary Education" 3 "Higher"
label values educf temp2

* Recompute log height from height for the pooled sample.
drop lheight
generate lheight=ln(height)
label var lheight "Log Height of Respondent"

* Restrict the sample to yearc 1970 - 2000 inclusive
* (rows with missing yearc fall outside the range and are removed too).
keep if inrange(yearc,1970,2000)

* Some variables were already created in the India file; drop them so
* they can be rebuilt consistently for the pooled data.
drop heightc

* One dummy per calendar month of birth (monthc).
drop cbirthmth*
foreach m of numlist 1/12 {
	generate cbirthmth`m'=(monthc==`m')
}

* Birth-order dummies: first, second, third, and fourth-or-later.
* NOTE(review): a missing bord compares greater than 3, so it is coded chldm4=1.
drop chld*
forvalues k=1/3 {
	generate chld`k'=(bord==`k')
}
generate chldm4=(bord>3)

* Number the rows within each caseid2 and record how many rows that
* caseid2 has in the data (max_no).
bys caseid2: gen no=_n
bys caseid2: gen max_no=_N

* Dummies on the number of rows per caseid2: exactly 2, 3, 4, or more than 4.
generate sib1=(max_no==2)
generate sib2=(max_no==3)
generate sib3=(max_no==4)
generate sibm3=(max_no>4)

foreach v of varlist sib1 sib2 sib3 sibm3 {
	tab `v'
}


** Values of agemay below 9 are treated as implausible and removed;
** the age-band dummies are then rebuilt from scratch.
drop if agemay<9
drop age915 age1618 age1924 age2530 age3149

* Bands on agemay: 9-15, 16-18, 19-24, 25-30 and 31-49.
* A missing agemay fails every upper bound, so it is 0 in all five dummies.
generate age915=(8<agemay & agemay<16)
generate age1618=(15<agemay & agemay<19)
generate age1924=(18<agemay & agemay<25)
generate age2530=(24<agemay & agemay<31)
generate age3149=(30<agemay & agemay<50)

* Keep the existing religion variable under the name religion_india and
* use v130 as the pooled religion variable.
rename religion religion_india
rename v130 religion

tab religion
** Categorise religion into christian, muslim, norel and other,
** with a separate flag for missing.

drop muslim hindu otherrel religionmiss

* Dummies from the pooled coding: 1, 2 and 7 get their own category,
* 96 and system-missing count as missing, everything else is other.
gen christian=(religion==1)
gen muslim=(religion==2)
gen norel=(religion==7)
gen otherrel=(religion!=1 & religion!=2 & religion!=7 & religion!=96 & religion!=.)
gen religionmiss=(religion==96 | religion==.)

* India rows are classified from religion_india instead.
* The otherrel condition previously read  a | b | c & India ; because & binds
* tighter than |, codes 1 and 4 were applied to every country. inlist() groups
* the three codes so that the India restriction covers all of them.
replace christian=1 if religion_india==3 & country=="India"
replace muslim=1 if religion_india==2 & country=="India"
replace otherrel=1 if inlist(religion_india,1,4,5) & country=="India"
replace religionmiss=1 if religion_india==. & country=="India"
* NOTE(review): religionmiss is never reset to 0 for India rows; if v130 is
* missing on those rows they all keep religionmiss=1 - confirm against the data.

** finally, generate the urban dummy: 1 when residence (v102) equals 1, 0 otherwise
** (a missing residence is therefore coded 0)
rename v102 residence
gen urban=(residence==1)
label var urban "1 if urban resident"

** Drop Armenia, Uzbekistan, Kazakhstan, Kyrgyz Republic, Bolivia, Guatemala, Bangladesh and Congo, as they have very few obs.
** NOTE(review): the drop below also removes Nigeria and Moldova, which the note above does not mention - confirm this is intended.

drop if country=="Armenia" | country=="Uzbekistan" | country=="Kazakhstan" | country=="Kyrgyz Republic" | country=="Bolivia" | country=="Guatemala" | country=="Bangladesh" | country=="Congo" | country=="Nigeria" | country=="Moldova"
* Normalise the spelling of the Cote d Ivoire country name to the apostrophe form.
replace country="Cote d'Ivoire" if country=="Cote d`Ivoire"
tab country

* Give system-missing ethnicity its own category code (99).
replace ethnicity=99 if ethnicity==.

* Inspect height and the missing-height flag, remove heights above 200
* (treated as obvious survey errors; missing heights are kept), then
* inspect both again.
tab height
tab ht_miss
drop if height>200 & !missing(height)
tab height
tab ht_miss

** Heights more than 3 s.d. from the country-specific mean are flagged by
** tall3 and short3; those rows are dropped later in this file.

* Cohort dummies on yearc: before 1980, 1980-1989, 1990-2000.
tab yearc
generate cohort1=(yearc<1980)
generate cohort2=(1979<yearc & yearc<1990)
generate cohort3=(1989<yearc & yearc<2001)

* Decade-of-birth dummies for children (may change at a later date).
* Each dummy is 1 when yearc is one of the listed years and 0 otherwise.
* The 1980s list previously contained 198 where 1984 was meant, so children
* born in 1984 were coded 0; the 1980s label also repeated the 1970s text.
gen yearc7079=inlist(yearc,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979)
gen yearc8089=inlist(yearc,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989)
gen yearc9000=inlist(yearc,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000)
label var yearc7079 "Child born between 1970-1979"
label var yearc8089 "Child born between 1980-1989"
label var yearc9000 "Child born between 1990-2000"

** create countryid
* egen group() numbers the distinct country names 1, 2, 3, ... in sorted order
tab country
sort country
egen countryid=group(country)
tab countryid
tab country
save world_child2, replace
* Snapshot the child data: the GDP file is loaded and reworked in memory below,
* and restore brings the child data back afterwards.
preserve

** merge in GDP
use penn.dta, clear
drop if country=="Nigeria"
* Rebuild countryid inside the GDP file from its own country names.
* NOTE(review): the ids only line up with the child data if both files hold
* exactly the same set of country names - confirm.
drop countryid
egen countryid=group(country)
tab countryid

tab lgdp
sort countryid yearc
* NOTE(review): penn.dta is overwritten in place (Nigeria removed, countryid renumbered).
save penn.dta, replace

* Country-level mean of gdp over the non-missing rows, then a flag for
* countries whose mean lies above the median of those country means.
drop if gdp==.
sort countryid
collapse (mean) gdp, by(countryid)

egen median=median(gdp)
gen above_med=(gdp>median)

keep countryid above_med
sort countryid
save median_merge, replace

* Back to the child data; attach the above-median flag by country.
restore
sort countryid
merge countryid using median_merge, _merge(median_merge)

tab median_merge
drop median_merge

* Attach the GDP file by country and yearc.
sort countryid yearc
merge countryid yearc using penn.dta, _merge(gdp_merge)

tab gdp_merge
* NOTE(review): rows found only in penn.dta (gdp_merge==2) are not dropped
* before the indicator is removed - confirm they should stay in the data.
drop gdp_merge

list countryid yearc gdp in 1/5

* One dummy for each of continent codes 1 to 3.
* NOTE(review): the keep list below has no continent pattern, so these three
* dummies are dropped again straight away - confirm whether they are wanted.
forvalues c=1/3 {
	gen continent`c'=(continent==`c')
}

** keep only the variables needed from here on
keep infant* lheight* height* tall* short* urban ht_miss malec cbirthmth* ///
	chld* age* educf* educm* religion* christian muslim otherrel ///
	country* countryid gdp* sweight year* id2 caseid* above* lgdp* ///
	elec radio tele fridge bike motorbike car v153 toilet_flush toilet_pit ///
	sib* bord hw3 neo* v150 b6 b7 under5 m18 m19 twinc v104

* Give the two age-at-death variables readable names.
rename b6 age_death
rename b7 age_deathmths

** Year-of-birth and country dummies for later use.
* NOTE(review): year dummies stop at 1997 although the sample is limited to
* 1970-2000 earlier in this file - confirm that 1998-2000 are left out on purpose.
forvalues yr=1970/1997 {
	gen yearc`yr'=(yearc==`yr')
}

* NOTE(review): 39 is hard-coded; it should equal the number of countryid groups.
forvalues c=1/39 {
	gen country`c'=(countryid==`c')
}

sort yearm
save world_child2, replace

** merge in mother imr etc
* collapse replaces the data in memory with the sweight-weighted mean of infant
* for each countryid-yearm cell; the result is stored as infantm in temp1.
tab yearm
sort countryid yearm
collapse (mean) infant [pw=sweight], by(countryid yearm)
rename infant infantm
sort countryid yearm 
save temp1, replace
* Rework the yearm-gdp file the same way as the GDP file: drop Nigeria and
* rebuild countryid from its own country names, then overwrite it in place.
* NOTE(review): the ids only match the child data if the country lists agree - confirm.
use yearm-gdp, clear
drop if country=="Nigeria"
drop countryid
egen countryid=group(country)
sort countryid yearm
save yearm-gdp, replace

* Reload the child data saved above and attach both files by countryid and yearm.
use world_child2
sort countryid yearm
merge countryid yearm using temp1
tab _merge
drop _merge
sort countryid yearm
merge countryid yearm using yearm-gdp
tab _merge
* NOTE(review): rows that exist only in the using file are not dropped here.
drop _merge

compress

save world_child2, replace

* Impute heights: reduce to one obs per mother (caseid2), impute, rebuild the
* height categories from the imputed heights, then merge them back onto the
* child-level data.

* Move the old (non-imputed) height categories aside in the master data.
foreach var of varlist tall1 tall2 short1 short2 tall3 short3 tallhalf shorthalf {
	rename `var' `var'_noimputed
}

* Move the old country mean and s.d. aside in the master data as well.
* These renames used to sit inside the preserve/restore section, so after
* restore the master still held height_mean and height_sd; the merge below
* keeps master values for variables present in both files, which meant the
* mean and s.d. of the imputed heights were silently discarded.
rename height_mean height_mean_old
rename height_sd height_sd_old

preserve
* One row per mother.
bys caseid2: gen number=_n
keep if number==1

xi: uvis regress height urban malec cbirthmth2-cbirthmth12 sib1 sib2 sib3 sibm3 age915 age1618 age2530 age3149 educf2-educf4 educm2-educm4 christian muslim otherrel i.country*i.yearc, seed(1001) boot gen(imputed_height)

** now recreate tall1, tall2 etc from the imputed heights,
** using the country-specific mean and s.d.

bys country: egen height_mean=mean(imputed_height)
bys country: egen height_sd=sd(imputed_height)

* Bands are open intervals in s.d. units around the country mean.
* tall2 and tall3 have no upper bound, so both exclude a missing
* imputed_height explicitly (a missing value compares greater than any
* number and would otherwise be coded as very tall).
bys country: gen tall1=(imputed_height > height_mean + height_sd & imputed_height < height_mean + 2*height_sd)
bys country: gen tall2=(imputed_height > height_mean + 2*height_sd & imputed_height < .)
bys country: gen short1=(imputed_height < height_mean - height_sd & imputed_height > height_mean - 2*height_sd)
bys country: gen short2=(imputed_height < height_mean - 2*height_sd)
bys country: gen tallhalf=(imputed_height > height_mean + 0.5*height_sd & imputed_height < height_mean + height_sd)
bys country: gen shorthalf=(imputed_height < height_mean - 0.5*height_sd & imputed_height > height_mean - height_sd)
bys country: gen tall3=(imputed_height > height_mean + 3*height_sd & imputed_height!=.)
bys country: gen short3=(imputed_height < height_mean - 3*height_sd)

keep caseid2 imputed_height height_mean height_sd tall1 tall2 short1 short2 tallhalf shorthalf tall3 short3
sort caseid2
save temp-caseid2-ht.dta, replace

restore

* Attach the mother-level imputed heights and categories to every child row.
sort caseid2
merge caseid2 using temp-caseid2-ht.dta, _merge(caseid2merge)
tab caseid2merge

tab tall1, missing

* Drop mothers more than 3 s.d. from their country mean.
drop if tall3==1 | short3==1

sum height imputed_height

** now create interactions of the height categories with log GDP

foreach var of varlist tall1 tall2 short1 short2 tallhalf shorthalf {
	gen `var'_lgdp=`var'*lgdp
	}

* Keep the old log height under a new name and rebuild lheight from the imputed heights.
rename lheight lheight_old
label var lheight_old "Log Height - not including imputed heights"
gen lheight=ln(imputed_height)
label var lheight "Log Height of Respondent - imputed"

save world_child3, replace

log close
exit

